library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)

# Return the SFARI-score colour palette subset by the index vector r.
# Positions 1-6 are score/level colours, 7-10 are grey tones used for the
# non-SFARI / unclassified categories (exact category mapping is defined by
# the plotting calls, e.g. SFARI_colour_hue(r=c(1:3,8,7)) below).
#
# Fix: the original body ended in an assignment (`pal = ...[r]`), which makes
# the function return its result invisibly; the subset is now returned
# explicitly and visibly.
SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')
  pal[r]
}


Load Dataset


# Module assignments: use the clustering algorithm selected here as the
# 'Module' column, and keep an ID -> Module lookup for later joins
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Extract the selected clustering column as a plain (unnamed) character vector
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# mutate() drops rownames, so save them and restore after the pipeline
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(Module = clusterings$Module, gene.score = as.character(gene.score)) %>%
                      mutate(gene.score = ifelse(gene.score=='Others', 'None', gene.score)) %>%
          dplyr::select(-matches(clustering_selected))
rownames(dataset) = rownames_dataset

# Fix any Gene Significance that is NA: recompute it directly as the
# polyserial correlation between the gene's expression and Diagnosis
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
  # Gandal dataset (provides datExpr and datMeta)
  load('./../Data/preprocessed_data.RData')
  datExpr = datExpr %>% data.frame
  
  for(g in GS_missing){
    # polyserial: correlation between a numeric variable and an ordinal one
    dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
}



# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Keep one row per Ensembl ID and drop entries without an ID
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# Add gene symbol
# Query the Feb-2014 Ensembl archive so annotations match Gencode v19
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)


# NOTE(review): g and GS_missing only exist if the GS-repair loop above ran;
# rm() emits a warning (but continues) when they are absent
rm(getinfo, mart, rownames_dataset, GO_annotations, g, GS_missing)


The features that will be considered for the classification model will be the ones WGCNA uses to identify significant modules and genes:



Filtering the 1229 genes that were not assigned to any cluster (represented as the gray cluster)

rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>% 
              dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']

rm(rm_cluster)


Summary of the changes made to the original WGCNA variables:


  • Using Module Membership variables instead of binary module membership

  • Including a new variable with the absolute value of GS

  • Removing genes assigned to the gray module (unclassified genes)

  • Adding the Objective variable: Binary label indicating if it’s in the SFARI dataset or not

# Preview: transpose the first 5 genes so the (many) features appear as rows
table_info = new_dataset %>% head(5) %>% t 

table_info %>% kable(caption = '(Transposed) features and their values for the first rows of dataset', 
                     col.names = colnames(table_info)) %>% kable_styling(full_width = F)
(Transposed) features and their values for the first rows of dataset
ENSG00000005243 ENSG00000120533 ENSG00000143815 ENSG00000171100 ENSG00000104783
MTcor 0.4424234 -0.0984706 0.3024664 0.6701023 0.7384892
GS 0.5773088 -0.0237431 0.3774346 0.2971990 0.3979591
MM.BF9C00 0.2028340 0.0052310 0.1800021 0.1452121 -0.3284692
MM.00BFC9 0.1065637 0.0839777 -0.1666649 0.1538377 -0.2391496
MM.AFA200 0.4423617 0.1676907 -0.0038683 0.2103471 0.1308024
MM.E58705 0.0027059 -0.3349507 0.0711587 -0.0811087 -0.4392788
MM.E08B00 -0.3760851 -0.1632560 0.0605323 -0.0197791 -0.5277045
MM.FC61D4 -0.0544823 -0.2525168 0.1022925 0.0435103 -0.5213252
MM.3FB500 0.1291954 -0.0569388 0.4859888 0.0325485 -0.2167086
MM.B3A000 -0.1332334 -0.0747677 0.4443737 0.0315333 -0.2823882
MM.00B81C -0.5765599 -0.1572348 0.2402064 -0.1392950 -0.3490138
MM.D475FE -0.4445916 -0.2682057 0.1360999 -0.1823822 -0.2674744
MM.00ACFC -0.1967597 -0.0416715 -0.0802592 0.0360149 -0.3956282
MM.00B3F1 -0.0321090 0.0023633 -0.1356941 0.0050660 -0.3547119
MM.DD8D00 -0.2350489 -0.0554090 0.0031597 0.0013227 -0.5329711
MM.A18CFF -0.1905724 -0.1336299 -0.0890341 -0.0118575 -0.1752024
MM.00B6EB -0.0867906 -0.2067695 0.2890163 0.0244979 -0.4883838
MM.8BAB00 -0.3194082 -0.3348128 0.0573330 -0.0061313 -0.3607469
MM.F67866 0.4570729 0.3337056 0.0188559 0.1819581 0.2775737
MM.00B92C 0.2924135 0.3610755 -0.0592034 0.1252458 0.0929811
MM.F067EB 0.2027280 0.3694462 -0.1036561 -0.0058085 0.0237863
MM.FF6A98 0.1810865 0.7908053 0.0375337 0.1067465 0.2123466
MM.00BFC0 0.2253819 0.4273730 0.0121879 -0.0042816 0.2760350
MM.97A900 0.1561653 0.3376495 0.0118569 0.0939749 0.1736092
MM.00C1A8 -0.2378982 -0.0286340 -0.3987647 -0.0521491 0.0207022
MM.6F99FF -0.4094027 -0.0049252 -0.0864109 -0.1664777 -0.2141910
MM.00C088 -0.1757616 0.0734600 -0.1872669 -0.2062253 0.0337206
MM.F265E7 0.0065986 0.3254029 -0.2813548 -0.1347271 0.0410107
MM.8594FF 0.2083700 -0.0203515 -0.0096639 -0.0452216 -0.0189785
MM.E88521 0.0356380 0.0321391 -0.1663063 0.0206671 0.1855691
MM.F17E4F -0.1332558 -0.1450732 -0.3399961 -0.1666015 -0.1987667
MM.A989FF 0.0547847 -0.0667886 -0.2673984 0.0728371 -0.2247399
MM.00BF7C -0.2818799 -0.0189527 -0.1981017 -0.0997961 -0.5090871
MM.B79F00 -0.1407720 0.0301538 -0.2439954 -0.1730695 -0.3424545
MM.F8766D -0.3616455 0.0955863 -0.2075336 -0.0836141 -0.3627237
MM.00C08D -0.1953855 0.0523297 0.0182803 -0.0810452 -0.4950331
MM.CA9700 -0.2530715 0.1792905 -0.0411042 -0.1313780 -0.2736308
MM.12B700 -0.0081337 0.2257283 -0.0728992 -0.0024294 -0.4837948
MM.F47A5F 0.0300009 0.1255822 -0.2009321 -0.0620662 -0.4256735
MM.7A97FF 0.0289730 0.2194633 -0.1771092 -0.1006185 -0.3498225
MM.8F91FF -0.1016082 0.3605316 -0.1704235 -0.1018813 -0.2614673
MM.AAA300 -0.3593588 0.3445319 -0.2060498 -0.2073087 -0.3384852
MM.00C0B2 -0.4322810 -0.0014908 0.0232239 -0.1871182 -0.5133832
MM.FF699E -0.5400028 0.0131789 -0.2923564 -0.2008242 -0.5524178
MM.FF6C92 -0.5339287 -0.1257697 -0.2701026 -0.1997399 -0.4952239
MM.00AEFA -0.2965236 -0.1323664 -0.1833544 -0.1350772 -0.6476345
MM.F763E0 -0.2491569 -0.3295197 -0.0281898 -0.1692153 -0.5366964
MM.00BAE1 -0.4032885 -0.3067542 -0.2622019 -0.0714883 -0.5812910
MM.BB9D00 -0.0422668 -0.3003371 -0.1999015 -0.0044675 -0.5214764
MM.FF67A3 -0.2256742 -0.0661792 -0.2636865 -0.1852566 -0.5051823
MM.E96AF1 -0.3166600 -0.1916914 -0.1804780 -0.0861607 -0.5950869
MM.F962DC -0.3777242 -0.1667395 -0.2520746 -0.1881677 -0.6624694
MM.988EFF -0.2498993 0.0106017 -0.2793687 -0.0713302 -0.1146649
MM.00A6FF -0.1993460 0.0240648 -0.1148809 0.0676391 -0.2756561
MM.00B8E4 -0.5520773 -0.0547391 -0.1363095 -0.0565164 -0.4286961
MM.CE9500 -0.1609515 -0.1345064 -0.4217047 -0.1237520 -0.3140501
MM.FB61D8 -0.2932511 0.1083117 -0.3364179 -0.0219065 -0.2839791
MM.B783FF -0.2974870 0.2860397 -0.0980191 -0.0503530 -0.2233977
MM.E26EF7 -0.5113100 0.1188229 -0.1068845 -0.2788153 -0.3579975
MM.69B100 -0.2298232 0.0449709 0.2026789 0.0982788 -0.2298216
MM.FF66A8 -0.3989518 0.0755634 -0.1504222 -0.0643972 -0.3860728
MM.00BD62 -0.1729552 0.2750711 -0.1243154 -0.0531687 -0.2253124
MM.F564E3 -0.3476570 0.0763643 -0.2756661 -0.1129809 -0.4835281
MM.B086FF -0.1878060 0.4603246 0.0276628 -0.1079187 0.0142361
MM.D19400 0.0727199 0.3688540 -0.1141579 -0.0189332 0.0413982
MM.00B4EE -0.5450656 0.1864493 -0.0341702 -0.2816868 -0.1937616
MM.9CA700 -0.4494503 0.3100581 -0.0465373 -0.2117188 -0.2072888
MM.7FAE00 -0.6644798 -0.0175256 -0.1339424 -0.2141670 -0.4530663
MM.00BE69 -0.5168616 -0.2173137 -0.2689779 -0.2267018 -0.4187253
MM.CA7BFF -0.4463094 -0.0279121 -0.1433458 -0.1370850 -0.3197675
MM.00BFC4 -0.5470485 0.0640713 -0.2963974 -0.1840458 -0.4012375
MM.FB737A -0.4414642 -0.0671004 -0.2441290 -0.1454191 -0.1498019
MM.00BCD9 -0.3106060 0.2922975 -0.1616332 -0.0661583 0.1390339
MM.E38900 -0.1930604 0.1439304 -0.3236567 0.0736892 0.2320726
MM.00C1AD -0.4747362 0.2552492 -0.2051899 -0.1540265 0.0358658
MM.A1A600 -0.4491277 0.0560755 -0.1820801 -0.1603359 0.1906966
MM.00C093 0.0114395 0.3175949 -0.3442086 0.0210813 0.2395274
MM.C39A00 -0.3442988 0.3086722 -0.3146722 -0.1127118 -0.1510965
MM.71B000 -0.1330448 0.1280027 -0.3454240 0.0110359 -0.3073646
MM.00C19E -0.0741635 0.3614308 -0.2160107 -0.0171070 0.0603642
MM.61B200 -0.3514805 0.1742921 -0.3829723 -0.2152558 -0.1870129
MM.FF65AE -0.3330500 -0.1190687 -0.1404204 -0.0827042 -0.2005884
MM.00BA38 -0.4942832 -0.1013124 -0.2586962 -0.2634162 -0.0854188
MM.FF61C2 -0.4668372 0.0215100 -0.0837382 -0.3056798 0.2081470
MM.00B0F7 0.1646842 0.0127795 0.2250830 0.1693213 -0.0140641
MM.00B7E8 -0.0612206 -0.0303831 0.1814632 -0.0875443 0.0287663
MM.00C1A3 -0.0483858 0.1366949 0.2855431 0.0398994 -0.0201202
MM.00AAFE 0.0227165 -0.2023820 0.2524830 0.2088135 0.0119522
MM.85AC00 0.0061256 -0.4009625 0.0792651 0.0368513 -0.1960617
MM.FF62BD -0.0927128 -0.1826014 -0.3334727 -0.1099248 -0.2212789
MM.00C082 0.1041267 -0.2473727 -0.0955795 0.2148814 -0.2218935
MM.00C0BB -0.1458479 -0.4074851 -0.2923323 -0.1101215 -0.4726822
MM.FF64B3 0.0282703 0.1454924 0.0639416 0.1882577 -0.1656456
MM.00C098 0.3611121 0.0302799 0.4154337 0.3687353 0.3156191
MM.4CB400 0.1687191 -0.0136555 0.1369337 0.1302580 0.0469669
MM.EC823C 0.1550789 -0.2705582 0.2470630 0.0491684 0.0665862
MM.BE80FF -0.1073464 -0.3042955 0.3104152 -0.1006400 -0.2040876
MM.DA8E00 -0.1089267 -0.3241793 0.3809301 -0.0304677 -0.0618126
MM.00C0B7 0.1899928 -0.0274603 0.1494733 0.0902505 -0.0204246
MM.FF61C6 0.0297616 -0.1706768 0.3971426 0.1104492 -0.1645761
MM.00A8FF 0.0156691 0.0637373 -0.1064660 -0.0365778 -0.0951825
MM.00BF76 0.4200009 0.0631236 -0.2695408 -0.0075495 0.0222318
MM.57B300 0.3160370 0.0956074 -0.2353250 0.0804195 -0.1040287
MM.C47DFF 0.3492542 0.1234515 -0.1986536 0.0462888 0.1380416
MM.00BE70 0.3568708 0.1524166 0.2122622 0.2493883 0.3599293
MM.A6A500 0.2977785 -0.0848181 0.2690949 0.2188868 0.3919065
MM.E66CF4 0.4939955 -0.0858880 0.2854911 0.3298311 -0.0044453
MM.00BDD1 0.3066574 -0.4521548 0.3015644 0.1544207 0.0660067
MM.D79000 0.3793051 -0.1864413 0.2956418 0.2382298 0.1501115
MM.2FB600 0.3873994 0.0459319 -0.0863881 0.1819079 0.2997601
MM.00BA42 0.3832956 -0.1980647 0.0685090 0.1830081 0.3634365
MM.00BBDD 0.3507585 -0.2051920 0.0448363 0.1541154 0.3150954
MM.519EFF 0.6059363 -0.0548627 -0.0788740 0.1271046 0.1769314
MM.D973FC 0.4343148 -0.0933457 0.1100902 0.0707750 0.2898581
MM.1CA3FF 0.4425080 0.3415437 0.1764281 0.1152422 0.5650341
MM.FC7180 0.6285117 0.2763286 0.0486198 0.1393682 0.4521191
MM.3DA1FF 0.5196423 0.1285667 0.3973143 0.3131387 0.4912240
MM.00BC53 0.4824763 0.0161265 0.2181999 0.1392386 0.7263569
MM.F97474 0.6706025 -0.0557581 0.2585743 0.2719067 0.4718147
MM.00BB4B 0.1543094 -0.3003355 -0.1706211 0.0673905 -0.2451155
MM.C79800 0.1953528 -0.1888684 -0.1029606 -0.0136535 -0.0646709
MM.FF63B8 0.0963193 -0.2125327 0.0201263 -0.0311881 0.0605314
MM.91AA00 0.1627080 -0.3292171 0.0043751 -0.0210109 0.0633165
MM.FD6F87 -0.0372128 -0.2900231 0.1654432 -0.0642405 -0.0760999
MM.D49200 0.4134427 -0.1147911 -0.0653478 -0.0227014 0.2015598
MM.00B1F4 0.3078544 0.1639288 -0.2519442 0.0806853 0.1795769
MM.FE61CF 0.0792379 -0.1282545 -0.1255512 -0.0636317 -0.1208749
MM.00BECD 0.1085551 -0.1029562 0.2770233 0.1041330 0.0190377
MM.619CFF -0.1708758 -0.1749386 0.1226747 -0.0307776 -0.1512764
MM.FF61CB 0.3490322 -0.0955637 0.4088728 0.0712598 0.2812172
MM.DE71F9 0.1715873 -0.1544436 0.1726366 0.0895199 0.3314373
MM.FE6D8D 0.3822037 -0.1991151 0.1385091 0.1601654 0.3896527
MM.EA8330 0.0014862 0.0244402 0.0910143 0.1026041 0.2180614
MM.00BCD5 0.0924869 0.1185616 0.1325512 0.1098747 0.4917774
MM.CF78FF 0.1787290 0.0912183 0.2183808 0.1515708 0.6464378
MM.78AF00 0.2182495 -0.1894561 0.3969925 0.0659376 0.2350151
MM.EF8046 0.2144135 -0.3344156 0.3234092 0.1166416 0.2331736
MM.00BD5B -0.3848673 -0.1725760 -0.0106865 -0.1631064 0.1276680
MM.ED68EE 0.0235976 -0.0306141 -0.0575470 -0.0406956 0.4087291
MM.F37C57 0.0984699 -0.0202135 0.1100042 -0.0598583 0.5159349
absGS 0.5773088 0.0237431 0.3774346 0.2971990 0.3979591
SFARI 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
rm(table_info)
# Keep the unfiltered data (still has gene.score and Module) for later
# lookups, and make the filtered feature set the working dataset
original_dataset = dataset
dataset = new_dataset

rm(new_dataset)

The final dataset contains 11933 observations (genes) and 142 variables



Exploratory Analysis


PCA of Variables


The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups (just like with Gandal’s dataset)

# PCA of the variables (transpose so each variable is an observation); the
# logical SFARI column is converted to numeric so prcomp accepts it
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp

# Tag each variable by type, inferred from its column-name prefix
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))


# Module-Trait correlation of each module, used to colour the MM variables
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

# Modules are named by hex colour ('#xxxxxx'); MM columns are named 'MM.xxxxxx'
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)


PCA of Samples


  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • SFARI Genes seem to be evenly distributed everywhere (perhaps they have a slightly higher distribution in the 2nd principal component?)

  • It’s not clear what the 2nd principal component is capturing

# Mean Expression data
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA of samples (genes): transpose so genes appear in pca$rotation
pca = dataset %>% t %>% prcomp

# Axis labels with % variance explained -- computed once instead of calling
# summary(pca) inside every plot (the original repeated it eight times)
pc_importance = summary(pca)$importance
pc1_lab = paste0('PC1 (',round(100*pc_importance[2,1]),'%)')
pc2_lab = paste0('PC2 (',round(100*pc_importance[2,2]),'%)')

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(pc1_lab) + ylab(pc2_lab) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(pc1_lab) + ylab(pc2_lab) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

# SFARI genes drawn more opaque; marginal densities compare the two classes
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(pc1_lab) + ylab(pc2_lab) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(pc1_lab) + ylab(pc2_lab) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4,
   pc_importance, pc1_lab, pc2_lab)



Dividing samples into Training and Test Sets


5.49% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.

# Class balance check: cross-tabulate the binary SFARI label
table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  11278
   TRUE  655
   #Total cases  11933
rm(table_info)

To divide our samples into training and test sets:

set.seed(123)

# Recover each gene's original SFARI score so the split can be stratified by
# score, not just by the binary SFARI label
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

# 70/30 split, stratified by gene.score
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


rm(sample_scores, train_idx)


Label distribution in training set


To fix this class imbalance, we are going to use SMOTE, an over-sampling technique that over-samples the minority class (SFARI Genes) by creating synthetic examples, in the training set

cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  7895
   TRUE  460
   #Total cases  8355


Labels distribution in test set


This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here

cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  3383
   TRUE  195
   #Total cases  3578


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


# Convert the logical label to a factor with valid R names (caret requires
# this when classProbs = TRUE)
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

k_fold = 10
cv_repeats = 5
# Repeated 10-fold CV; SMOTE over-sampling is applied inside each CV fold so
# synthetic samples never leak into the held-out fold
smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats, 
                                   verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)

# There is some perfect multicollinearity that doesn't let us do the vif analysis, so I'll remove those variables (you cannot use alias in caret::train, so I had to train the model again directly with glm)
ld.vars = attributes(alias(glm(SFARI~., data = train_set, family = 'binomial'))$Complete)$dimnames[[1]]

# Remove the linearly dependent variables by subtracting them from the formula
formula.new = as.formula( paste('SFARI ~ .', paste(ld.vars, collapse='-'), sep='-') )

# Retrain model without these variables
fit = caret::train(formula.new, data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)


rm(smote_over_sampling, ld.vars, formula.new, k_fold, cv_repeats)


Performance


The model has an AUC of 0.6865

But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model

# VIF
# VIF per feature -- computed once (the original called car::vif twice, once
# for the names and once for the values)
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
            mutate(outlier = VIF>10)  # rule of thumb: VIF > 10 flags problematic collinearity

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data, vif_vals)

Correlation plot

# Pairwise correlations between features (last column, the SFARI label, excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics (see the reference linked in the original analysis), but as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again




Ridge Regression


Notes:

### DEFINE FUNCTIONS

# Split the data into train and test sets, stratified by SFARI score.
#
# p:          proportion of samples assigned to the training set
# seed:       RNG seed for a reproducible partition
# data:       feature data frame (genes in rows); defaults to the global
#             `dataset` so existing calls are unchanged
# score_data: data frame carrying gene.score per gene; defaults to the global
#             `original_dataset`
#
# Returns a named list with 'train_set' and 'test_set'.
create_train_test_sets = function(p, seed, data = dataset, score_data = original_dataset){
  
  # Get SFARI Score of all the samples so our train and test sets are balanced for each score
  sample_scores = data %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
                  left_join(score_data %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                             by = 'ID') %>% 
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  set.seed(seed)
  # Stratified partition on the (multi-level) SFARI score
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
  
  train_set = data[train_idx,]
  test_set = data[-train_idx,]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
}



# Train one ridge-regression (glmnet, alpha = 0) classifier on a fresh
# train/test split and return its test-set performance, predictions and
# coefficients.
#
# p:    train-set proportion passed to create_train_test_sets
# seed: RNG seed (controls both the split and the CV folds)
#
# Returns a list: acc, prec, rec, F1, AUC (scalars), preds (data frame with
# ID / prob / pred per test gene) and coefs (numeric vector, intercept first).
run_model = function(p, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model
  # caret with classProbs = TRUE needs factor levels that are valid R names
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  k_fold = 10
  cv_repeats = 5
  # SMOTE over-sampling applied inside each CV fold
  smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  # alpha = 0 -> pure ridge penalty; lambda tuned over lambda_seq by ROC
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set (probability of the 'SFARI' class, 0.5 cutoff)
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model
  # (logicals are coerced to 0/1, so positive = '1' means the SFARI class)
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features (at the CV-selected lambda)
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 'AUC' = AUC, 'preds' = preds, 'coefs' =coefs))
}


### RUN MODEL

# Parameters
p = 0.75          # train-set proportion per iteration

n_iter = 25       # number of independent train/test resampling iterations
seeds = 123:(123+n_iter-1)

# Store outputs
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
# Per-gene accumulators: prob and pred are summed over iterations, n counts
# how many times each gene landed in a test set
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  prec = c(prec, model_output[['prec']])
  rec = c(rec, model_output[['rec']])
  F1 = c(F1, model_output[['F1']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  # Accumulate this iteration's test-set predictions into the running totals
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

# Average coefficients over all iterations; average each gene's probability
# over the iterations where it was in a test set, then derive the final label
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)


rm(p, seeds, update_preds, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

# Keep only genes that appeared in at least one test set, and attach their
# GS and MTcor for the analyses below
test_set = predictions %>% filter(n>0) %>% 
           left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  8816 2452   11268
   TRUE  346 308   654
   #Total cases  9162 2760   11922
rm(conf_mat)


Accuracy: Mean = 0.7619 SD = 0.0093


Precision: Mean = 0.1091 SD = 0.0075


Recall: Mean = 0.468 SD = 0.0343


F1 score: Mean = 0.1769 SD = 0.0119


ROC Curve: Mean = 0.6789 SD = 0.0151

# ROCR prediction object from the averaged probabilities vs true labels
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

# ROC curve (TPR vs FPR) and its AUC
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # diagonal = random classifier


Lift Curve

# Lift vs rate of positive predictions
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor has a very small coefficient, Gene Significance has a negative coefficient and absGS a positive one

# Map each gene to its module (module names are hex colours, '#' stripped so
# they match the 'MM.xxxxxx' feature names)
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% 
                 left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))

# One row per feature: its averaged coefficient, the module's MTcor, and the
# proportion of SFARI genes in that module
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% 
            left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% 
            summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))

# Show only the non-module features
coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
              dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>% 
              kable(align = 'cc', 
                    caption = 'Regression Coefficients (filtering MM features)') %>% 
              kable_styling(full_width = F)
Regression Coefficients (filtering MM features)
Feature Coefficient
absGS 0.9070567
GS 0.1444500
MTcor -0.1667778
Intercept -0.8236450


There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module

load('./../Data/ORA.RData')

# Build a table of SFARI enrichment per module as 1 - ORA p-value; modules
# without a SFARI entry get enrichment 0
enrichment_SFARI_info = data.frame('Module'=as.character(), 'SFARI_enrichment'=as.numeric())
for(m in names(enrichment_SFARI)){
  m_info = enrichment_SFARI[[m]]
  enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'],1)
  enrichment_SFARI_info = enrichment_SFARI_info %>% 
                          add_row(Module = gsub('#','',m), SFARI_enrichment = enrichment)
}

plot_data = coef_info %>% dplyr::rename('Module' = feature) %>% 
            left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))

# Each point coloured with its module's own (hex) colour
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) + 
         geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
         geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) + 
         theme_minimal() + xlab('Coefficient') + 
         ylab('SFARI Genes Enrichment'))
# The other enrichment objects loaded from ORA.RData are not needed here
rm(enrichment_old_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
   m, m_info, enrichment)


There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation, points coloured by module
# (the colour vector is filtered with the same condition as the data, so the
# rows stay aligned)
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]), 
                         alpha = 0.7) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse Results


Score distribution by SFARI Label


SFARI genes have a higher score distribution than the rest, but the overlap is large

plot_data = test_set %>% dplyr::select(prob, SFARI)

# Density of the model score by true SFARI label; dashed vertical lines mark
# each group's mean score
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))


Score distribution by SFARI Score


Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be a statistically significant positive relation between the SFARI scores and the probability assigned by the model

# Attach each gene's original SFARI score, relabelling non-SFARI genes as
# 'Neuronal' (GO neuron annotation) or 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  133
   2  174
   3  347
   Neuronal  761
   Others  10507
   #Total cases  11922
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Pairwise Welch t-tests between all score groups; bracket y-positions are
# staggered so the significance annotations don't overlap
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','Others'), c('2','Neuronal'),
                   c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .02) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')

rm(mean_vals, increase, base, pos_y_comparisons)


Genes with the highest Probabilities


  • Considering the class imbalance in the test set (1:19), SFARI genes are strongly over-represented in this list (1:5)

  • High concentration of genes with a SFARI Score of 1

# Top 50 genes by model probability, annotated with gene symbol, module
# and SFARI score, rendered as a styled table
plot_data = test_set %>%
            dplyr::select(prob, SFARI) %>%
            mutate(ID = rownames(test_set)) %>%
            arrange(desc(prob)) %>%
            top_n(50, wt = prob) %>%
            left_join(original_dataset %>% mutate(ID = rownames(original_dataset)),
                      by = 'ID') %>%
            mutate(gene.score = ifelse(gene.score == 'None',
                                       ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
                                       gene.score)) %>%
            left_join(gene_names, by = c('ID' = 'ensembl_gene_id'))

plot_data %>%
  dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob,
                'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
  mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4),
         Probability = round(Probability, 4),
         GeneSignificance = round(GeneSignificance, 4)) %>%
  dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                gene.score) %>%
  kable(caption = 'Genes with highest model probabilities from the test set') %>%
  kable_styling(full_width = F)
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
TMEM132B -0.2750 -0.3724 #7A97FF 0.8832 Others
BSN -0.1497 0.0246 #FF64B3 0.8808 Neuronal
KIAA1549L -0.1577 -0.2597 #00C0BB 0.8773 Others
NBEA -0.4373 -0.1190 #619CFF 0.8752 1
NCOR2 -0.3729 -0.4591 #E08B00 0.8711 Others
KIAA1244 0.1093 0.0597 #B3A000 0.8701 Others
MYT1L -0.1383 -0.3724 #7A97FF 0.8693 1
SYNGAP1 -0.3336 -0.4591 #E08B00 0.8672 1
CNKSR2 -0.0991 -0.5210 #6F99FF 0.8614 2
DMXL2 -0.2011 -0.1190 #619CFF 0.8605 3
TSHZ3 0.2316 0.4424 #FC7180 0.8585 1
MYCBP2 0.2869 0.1528 #00C0B7 0.8540 Neuronal
KIDINS220 -0.2188 -0.5173 #00C0B2 0.8531 Neuronal
KNDC1 -0.3398 -0.3455 #00C1A8 0.8485 Neuronal
ATP2A2 -0.3723 -0.1190 #619CFF 0.8463 Others
ARHGAP32 0.1858 0.2730 #4CB400 0.8444 3
IQSEC2 -0.5350 -0.4591 #E08B00 0.8433 1
SV2B -0.4331 -0.6179 #FF66A8 0.8423 Others
SLC8A2 -0.4879 -0.4591 #E08B00 0.8405 Others
SLC12A5 -0.4504 -0.6811 #FF6C92 0.8404 2
SYT1 -0.2322 0.1528 #00C0B7 0.8404 Neuronal
HCN1 -0.4515 -0.6811 #FF6C92 0.8390 Others
CREBBP 0.0222 -0.0315 #00C082 0.8379 1
RASAL2 -0.1669 0.2774 #EC823C 0.8374 Others
LMTK2 -0.4942 -0.4391 #00AEFA 0.8361 Others
PLXNA2 -0.2708 -0.5675 #00B8E4 0.8358 Others
ZBTB18 -0.3300 -0.3724 #7A97FF 0.8355 Neuronal
PTPRD 0.3052 0.2774 #EC823C 0.8352 Neuronal
CHRM3 -0.1005 0.2774 #EC823C 0.8327 3
RAPGEFL1 -0.2166 -0.4591 #E08B00 0.8319 Others
CLSTN1 -0.3309 0.0246 #FF64B3 0.8312 Others
KCNV1 -0.0576 -0.2597 #00C0BB 0.8281 Others
MTMR7 -0.4081 -0.7558 #FF699E 0.8276 Others
LRP6 0.2101 0.2774 #EC823C 0.8259 Neuronal
KLHL42 0.4124 0.0482 #BE80FF 0.8254 Others
CADM2 0.2976 0.1526 #8594FF 0.8249 3
HOMER1 0.0867 0.2877 #FF61C6 0.8240 3
GNB5 -0.3051 0.1526 #8594FF 0.8236 Others
ARHGEF7 -0.1097 -0.1190 #619CFF 0.8222 Neuronal
MARCH6 0.2202 0.1585 #DA8E00 0.8211 Others
CACNA1E -0.3543 -0.6179 #FF66A8 0.8204 2
ARID1A 0.3879 0.0597 #B3A000 0.8202 Others
NAV3 -0.3320 -0.1521 #E58705 0.8199 Others
EIF2AK1 -0.3676 -0.5740 #F962DC 0.8199 Others
FRMD4A 0.1135 0.0597 #B3A000 0.8191 Others
KCNB1 -0.2885 0.1526 #8594FF 0.8189 1
NLGN4X 0.4414 0.2774 #EC823C 0.8182 2
PCSK2 -0.2178 -0.5210 #6F99FF 0.8176 Neuronal
CLIP3 -0.2203 -0.2597 #00C0BB 0.8174 Others
ZBTB16 0.3058 0.2774 #EC823C 0.8141 3





Negative samples distribution


The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)

# Restrict to the negative samples (non-SFARI genes): these are the
# candidates the model may flag as ASD-related
negative_set = filter(test_set, !SFARI)

negative_set_table = apply_labels(negative_set,
                                  prob = 'Assigned Probability',
                                  pred = 'Label Prediction')

# Count of predicted labels among the negative samples
cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  8816
   TRUE  2452
   #Total cases  11268

2452 genes are predicted as ASD-related


# Probability density of the negative samples; the dotted line marks the
# 0.5 classification threshold
ggplot(negative_set, aes(prob)) +
  geom_density(color = '#F8766D', fill = '#F8766D', alpha = 0.5) +
  geom_vline(xintercept = 0.5, color = '#333333', linetype = 'dotted') +
  xlab('Probability') +
  ggtitle('Probability distribution of the Negative samples in the Test Set') +
  theme_minimal()




Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a positive relation with the Gene Significance (under-expressed genes having on average the lowest probabilities and over-expressed genes the highest) (this pattern was the opposite in Gandal’s dataset)
# Probability vs Gene Significance, coloured by the module-diagnosis
# correlation of each gene's module; loess trend in gray
ggplot(negative_set, aes(prob, GS, color = MTcor)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = 0, color = 'gray', linetype = 'dashed') +
  xlab('Probability') + ylab('Gene Significance') +
  scale_color_gradientn(colours = c('#F8766D', 'white', '#00BFC4')) +
  ggtitle('Relation between Probability and Gene Significance') +
  theme_minimal()




Probability and Module-Diagnosis correlation


  • There’s not a strong relation between the Module-Diagnosis correlation of each gene’s assigned module and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging to modules with high module-Diagnosis correlations (both positive and negative ones) than to genes belonging to modules with low correlations

# Module-diagnosis correlation vs model probability, coloured by Gene
# Significance; the dashed line marks the overall mean probability
ggplot(negative_set, aes(MTcor, prob, color = GS)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = mean(negative_set$prob), color = 'gray', linetype = 'dashed') +
  scale_color_gradientn(colours = c('#F8766D', '#F8766D', 'white', '#00BFC4', '#00BFC4')) +
  xlab('Modules ordered by their correlation to ASD') + ylab('Model probability') +
  theme_minimal()




Summarised version, plotting by module instead of by gene


The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

Again, the model seems to give lower probabilities to genes belonging to modules with a low (absolute) correlation to Diagnosis than to the rest

# One observation per module: mean and sd of its genes' probabilities,
# joined back to the module identifier through MTcor
plot_data = negative_set %>%
            group_by(MTcor) %>%
            summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor > 0, 'Positive', 'Negative')) %>%
            left_join(original_dataset, by = 'MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>%
            distinct() %>%
            dplyr::rename('ID' = Module)

ggplotly(plot_data %>%
         ggplot(aes(MTcor, mean, size = n, color = MTcor_sign)) +
         geom_point(aes(id = ID), alpha = 0.7) +
         geom_smooth(method = 'loess', color = 'gray', se = FALSE) +
         geom_smooth(method = 'lm', se = FALSE) +
         xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Module') +
         theme_minimal())




Probability and level of expression


# Gupta dataset: loads the preprocessed objects (at least datExpr and
# DE_info, which are used below) into the environment
load('./../Data/preprocessed_data.RData')

# Coerce to plain data frames for the row-wise operations that follow
datExpr = data.frame(datExpr)
DE_info = data.frame(DE_info)

The relation between mean level of expression of the genes and the probability assigned by the model is a lot weaker than in Gandal’s dataset, but it’s still there

# Per-gene mean and standard deviation of expression
mean_and_sd = data.frame(ID = rownames(datExpr),
                         meanExpr = rowMeans(datExpr),
                         sdExpr = apply(datExpr, 1, sd))

module_info = original_dataset %>%
              mutate(ID = rownames(original_dataset)) %>%
              dplyr::select(ID, Module)

plot_data = negative_set %>%
            left_join(mean_and_sd, by = 'ID') %>%
            left_join(module_info, by = 'ID')
# The joined module column comes in last; give it a clean name
colnames(plot_data)[ncol(plot_data)] = 'Module'

ggplot(plot_data, aes(meanExpr, prob)) +
  geom_point(alpha = 0.2, color = '#0099cc') +
  geom_smooth(method = 'loess', color = 'gray', alpha = 0.3) +
  geom_smooth(method = 'lm', color = '#808080', se = FALSE, linetype = 'dashed') +
  xlab('Mean Expression') + ylab('Probability') +
  ggtitle('Mean expression vs model probability by gene') +
  theme_minimal()

rm(mean_and_sd, module_info)

Grouping the genes by module we see there may be a small non-linear relation between mean level of expression and model probability, but again, it’s still not as strong as in Gandal’s dataset

# Collapse to one point per module: mean expression, mean probability and
# module size
module_summary = plot_data %>%
                 group_by(Module) %>%
                 summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n = n())

# Module identifiers double as point colours (they are hex colour strings)
ggplotly(module_summary %>%
         ggplot(aes(meanExpr, meanProb, size = n)) +
         geom_point(color = module_summary$Module, alpha = 0.6) +
         geom_smooth(method = 'loess', se = TRUE, color = 'gray', alpha = 0.1, size = 0.7) +
         geom_smooth(method = 'lm', color = '#808080', se = FALSE, linetype = 'dashed') +
         xlab('Mean Level of Expression') + ylab('Average Model Probability') +
         theme_minimal() + theme(legend.position = 'none') +
         ggtitle('Mean expression vs model probability by Module'))
rm(module_summary)




Probability and LFC


There is a relation between probability and LFC, so it IS capturing a bit of true information (because LFC and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in genes over-expressed in ASD (opposite to the behaviour found in Gandal’s dataset)
# Join the differential-expression statistics (log fold change, adjusted
# p-values) to the negative samples
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

# LFC vs model probability for the negative samples
# (the original chain had two identical xlab('LFC') layers; in ggplot2
# only the last label layer takes effect, so one suffices)
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              xlab('LFC') + ylab('Probability') +
              theme_minimal() + ggtitle('LFC vs model probability by gene')

  • The relation is stronger in Differentially Expressed genes
# Same LFC-vs-probability relation, split into under-expressed (left) and
# over-expressed (right) genes, coloured by differential-expression status
# (adjusted p-value < 0.05). Both panels share the same y range; negative
# inner margins make them sit flush against each other.
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>%
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
     ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
     theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Right panel: y-axis drawn on the right; the left-hand label is empty
# (the redundant ylab('Probability') that was immediately overridden by
# ylab('') has been removed)
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% 
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('') +
     scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
     theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Conclusion


  • The model doesn’t seem to be capturing the level of expression of the genes as strongly as it did in Gandal’s dataset. This would mean that performing some sort of bias correction is not as necessary for this dataset

  • Even though the mean expression bias is not as strong, we can still see some statistically significant differences in probabilities by SFARI Gene Scores, which suggests that this bias not only comes from the level of expression, but from something else as well

  • It seems to be capturing true biological signals (based on the GS and the log fold change plots)


Saving results

# Attach gene symbols to the test-set predictions and persist the model
# artefacts for downstream analyses
predictions = left_join(test_set, gene_names, by = c('ID' = 'ensembl_gene_id'))

save(predictions, dataset, fit, file = './../Data/Ridge_model.RData')




Session info

# Record R version and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         doParallel_1.0.15  iterators_1.0.12   foreach_1.5.0     
##  [5] kableExtra_1.1.0   expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [9] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
## [13] caret_6.0-86       lattice_0.20-41    polycor_0.7-10     biomaRt_2.40.5    
## [17] ggpubr_0.2.5       magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3     
## [21] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.0        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.1.0        tibble_3.0.1       ggplot2_3.3.2     
## [33] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0            RSQLite_2.2.0              
##   [3] AnnotationDbi_1.46.1        htmlwidgets_1.5.1          
##   [5] BiocParallel_1.18.1         pROC_1.16.2                
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] miniUI_0.1.1.1              withr_2.2.0                
##  [11] colorspace_1.4-1            Biobase_2.44.0             
##  [13] highr_0.8                   rstudioapi_0.11            
##  [15] stats4_3.6.3                ggsignif_0.6.0             
##  [17] TTR_0.23-6                  labeling_0.3               
##  [19] GenomeInfoDbData_1.2.1      bit64_0.9-7                
##  [21] farver_2.0.3                vctrs_0.3.1                
##  [23] generics_0.0.2              ipred_0.9-9                
##  [25] xfun_0.12                   R6_2.4.1                   
##  [27] GenomeInfoDb_1.20.0         locfit_1.5-9.4             
##  [29] bitops_1.0-6                DelayedArray_0.10.0        
##  [31] assertthat_0.2.1            promises_1.1.0             
##  [33] scales_1.1.1                nnet_7.3-14                
##  [35] ggExtra_0.9                 gtable_0.3.0               
##  [37] timeDate_3043.102           rlang_0.4.6                
##  [39] genefilter_1.66.0           splines_3.6.3              
##  [41] lazyeval_0.2.2              ModelMetrics_1.2.2.2       
##  [43] acepack_1.4.1               broom_0.5.5                
##  [45] checkmate_2.0.0             yaml_2.2.1                 
##  [47] reshape2_1.4.4              abind_1.4-5                
##  [49] modelr_0.1.6                crosstalk_1.1.0.1          
##  [51] backports_1.1.8             quantmod_0.4.17            
##  [53] httpuv_1.5.2                Hmisc_4.4-0                
##  [55] tools_3.6.3                 lava_1.6.7                 
##  [57] ellipsis_0.3.1              BiocGenerics_0.30.0        
##  [59] Rcpp_1.0.4.6                plyr_1.8.6                 
##  [61] base64enc_0.1-3             progress_1.2.2             
##  [63] zlibbioc_1.30.0             RCurl_1.98-1.2             
##  [65] prettyunits_1.1.1           rpart_4.1-15               
##  [67] zoo_1.8-8                   S4Vectors_0.22.1           
##  [69] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [71] cluster_2.1.0               fs_1.4.0                   
##  [73] data.table_1.12.8           openxlsx_4.1.4             
##  [75] reprex_0.3.0                matrixStats_0.56.0         
##  [77] hms_0.5.3                   mime_0.9                   
##  [79] evaluate_0.14               xtable_1.8-4               
##  [81] XML_3.99-0.3                rio_0.5.16                 
##  [83] jpeg_0.1-8.1                readxl_1.3.1               
##  [85] shape_1.4.4                 IRanges_2.18.3             
##  [87] compiler_3.6.3              KernSmooth_2.23-17         
##  [89] crayon_1.3.4                htmltools_0.4.0            
##  [91] mgcv_1.8-31                 later_1.0.0                
##  [93] Formula_1.2-3               geneplotter_1.62.0         
##  [95] lubridate_1.7.4             DBI_1.1.0                  
##  [97] dbplyr_1.4.2                MASS_7.3-51.6              
##  [99] Matrix_1.2-18               cli_2.0.2                  
## [101] gdata_2.18.0                gower_0.2.1                
## [103] GenomicRanges_1.36.1        pkgconfig_2.0.3            
## [105] foreign_0.8-76              recipes_0.1.10             
## [107] xml2_1.2.5                  annotate_1.62.0            
## [109] webshot_0.5.2               XVector_0.24.0             
## [111] prodlim_2019.11.13          rvest_0.3.5                
## [113] digest_0.6.25               rmarkdown_2.1              
## [115] cellranger_1.1.0            htmlTable_1.13.3           
## [117] curl_4.3                    shiny_1.4.0.2              
## [119] gtools_3.8.2                lifecycle_0.2.0            
## [121] nlme_3.1-147                jsonlite_1.7.0             
## [123] fansi_0.4.1                 pillar_1.4.4               
## [125] fastmap_1.0.1               httr_1.4.1                 
## [127] survival_3.1-12             xts_0.12-0                 
## [129] glue_1.4.1                  zip_2.0.4                  
## [131] png_0.1-7                   glmnet_3.0-2               
## [133] bit_1.1-15.2                class_7.3-17               
## [135] stringi_1.4.6               blob_1.2.1                 
## [137] DESeq2_1.24.0               latticeExtra_0.6-29        
## [139] caTools_1.18.0              memoise_1.1.0